{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from nms_cupy import cupy_call_nms_kernel,_load_kernel,_nms_gpu_code \n",
    "import numpy as np\n",
    "import cupy as cp\n",
    "from nms_numba_cpu import nms_cpu\n",
    "from nms_numba import nms_kernel,numba_call_nms_kernel\n",
    "from numba import cuda\n",
    "thresh=0.7"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1.ubuntu 18    \n",
    "2.python 3.7    \n",
    "3.cuda   9.2    \n",
    "4.gpu  2080ti    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1.numba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The slowest run took 225.82 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
      "24.4 ms ± 21.7 ms per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
     ]
    }
   ],
   "source": [
    "bbox=np.load(\"bbox.npy\")\n",
    "n_bbox = bbox.shape[0]\n",
    "threads_per_block = 64\n",
    "col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)\n",
    "blocks = (col_blocks, col_blocks, 1)\n",
    "threads = (threads_per_block, 1, 1)\n",
    "\n",
    "mask_dev = cuda.device_array((n_bbox * col_blocks,), dtype=np.uint64)\n",
    "bbox=bbox.reshape(-1)\n",
    "bbox =cuda.to_device(bbox)\n",
    "\n",
    "%timeit nms_kernel[blocks, threads](n_bbox, bbox, mask_dev)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "63.7 ms ± 458 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "bbox=np.load(\"bbox.npy\")\n",
    "%timeit numba_call_nms_kernel(bbox,0.7)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2.cupy #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5.73 ms ± 19.8 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "bbox=np.load(\"bbox.npy\")\n",
    "bbox=cp.asarray(bbox)\n",
    "threads_per_block = 64\n",
    "col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)\n",
    "blocks = (col_blocks, col_blocks, 1)\n",
    "threads = (threads_per_block, 1, 1)\n",
    "\n",
    "mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)\n",
    "bbox = cp.ascontiguousarray(bbox, dtype=np.float32)\n",
    "kern = _load_kernel('nms_kernel', _nms_gpu_code)\n",
    "%timeit kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh),bbox, mask_dev))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "18.5 ms ± 435 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "bbox=np.load(\"bbox.npy\")\n",
    "bbox=cp.asarray(bbox)\n",
    "%timeit cupy_call_nms_kernel(bbox,0.7)\n",
    "# print(n_selec.get())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3.numba_cpu #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "61.9 ms ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "bbox=np.load(\"bbox.npy\")\n",
    "%timeit keep=nms_cpu(bbox,0.7)\n",
    "# print(keep.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4.pybind11 cuda"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "26.1 ms ± 551 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "import nms_cuda \n",
    "import numpy as np\n",
    "\n",
    "bbox=np.load(\"bbox.npy\")\n",
    "n_bbox = bbox.shape[0]\n",
    "\n",
    "keep_out=np.zeros(bbox.shape[0],dtype=np.int32)\n",
    "%timeit n=nms_cuda.nms_cuda(keep_out,bbox)\n",
    "# print(keep_out)\n",
    "# print(n)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
